1 Aim of the analysis

Analyse transcript level data from 5’ sensing comparisons and do some exploratory data analysis.

2 DESeq2 analysis

Read in data: htseq-count files created with usegalaxy.eu

htseq_CDS <- read.delim("input/Galaxy151-[CDS_htseqCount].tabular", header=TRUE, row.names=1)
htseq_TUs <- read.delim("input/Galaxy154-[TU_htseqCount].tabular", header=TRUE, row.names=1)
zuordnung <- read.delim("input/FileNames_Galaxy-transcript-history.csv", header=TRUE)
row.names(zuordnung) <- names(htseq_CDS)[c(2,3,1,8,7,6,5,4)]
names(htseq_CDS) <- zuordnung[names(htseq_CDS),]$strain
names(htseq_TUs) <- zuordnung[names(htseq_TUs),]$strain
coldata <- read.csv("input/colData_transcript.csv", row.names=1)
htseq_CDS <- htseq_CDS[,row.names(coldata)]
htseq_TUs <- htseq_TUs[,row.names(coldata)]

Do actual analysis

# create DESeq2 data object
ddsMat_CDS <- DESeqDataSetFromMatrix(countData = htseq_CDS,
                                 colData = coldata,
                                 design = ~ strain)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
ddsMat_TUs <- DESeqDataSetFromMatrix(countData = htseq_TUs,
                                 colData = coldata,
                                 design = ~ strain)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
# run DESeq
ddsMat_CDS <- DESeq(ddsMat_CDS) 
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ddsMat_TUs <- DESeq(ddsMat_TUs) 
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
write.csv(data.frame(factor=ddsMat_CDS$sizeFactor), file="output/transcript_sizeFactors.csv")
write.csv(data.frame(counts(ddsMat_CDS, normalized=TRUE)), file="output/Transcript_CDS_normalizedCounts.csv")

2.1 Diagnostic Plots

plotDispEsts(ddsMat_CDS, main="CDS comparison", xlab="Mean of Normalized Counts", ylab="Dispersion")

plotDispEsts(ddsMat_TUs, main="TU comparison", xlab="Mean of Normalized Counts", ylab="Dispersion")

pdf(file="output/DESeq2_Plots/CDS/ddsMat_CDS_DispEsts.pdf", width=4.5, height=4.5)
plotDispEsts(ddsMat_CDS, main="CDS comparison", xlab="Mean of Normalized Counts", ylab="Dispersion")
dev.off()
## png 
##   2
pdf(file="output/DESeq2_Plots/TU/ddsMat_TU_DispEsts.pdf", width=4.5, height=4.5)
plotDispEsts(ddsMat_TUs, main="TU comparison", xlab="Mean of Normalized Counts", ylab="Dispersion")
dev.off()
## png 
##   2
p <- PCA_plot(ddsMat_CDS, "RNA Features")
p 

ggsave("output/DESeq2_Plots/CDS/ddsMat_CDS_PCA.pdf", plot=p, width=9, height=9, units="cm")

p <- PCA_plot(ddsMat_TUs, "TU")
p

ggsave("output/DESeq2_Plots/TU/ddsMat_TU_PCA.pdf", plot=p, width=9, height=9, units="cm")

p <- heatmap_plot(ddsMat_CDS, "RNA Features")
p

ggsave("output/DESeq2_Plots/CDS/ddsMat_CDS_heatMap.pdf", plot=p, width=15, height=12, units="cm")

p <- heatmap_plot(ddsMat_TUs, "TU")
p

ggsave("output/DESeq2_Plots/TU/ddsMat_TU_heatMap.pdf", plot=p, width=15, height=12, units="cm")

2.2 Use size factors to create .grp files

if(FALSE){
  transcript_coverage <- read.delim("input/transcript_coverage_combined_5sensing.txt", header=TRUE, row.names=1)
  # create DESeq2 data object
  ddsMat_transc <- DESeqDataSetFromMatrix(countData = transcript_coverage,
                                          colData = coldata,
                                          design = ~ strain)
  ddsMat_transc$sizeFactor <- ddsMat_CDS$sizeFactor
  transc_norm_counts <- counts(ddsMat_transc, normalized=TRUE)
  rm(ddsMat_transc)
  
  grp_File <- data.frame("WT"=apply(transc_norm_counts[,c("WT1", "WT2", "WT3")],1,mean), "dWT"=apply(transc_norm_counts[,c("dWT1", "dWT2", "dWT3")],1,mean), "TV"=apply(transc_norm_counts[,c("TV1", "TV2")],1,mean))
  grp_File$seqname <- rep(NA, length(grp_File$WT))
  seqnames <- c("BA000022.2", "AP004310.1", "AP004311.1", "AP004312.1", "AP006585.1")
  for(i in seqnames){
    grp_File[which(grepl(i, row.names(grp_File))),]$seqname <- i
  }
  grp_File$strand <- rep(NA, length(grp_File$WT))
  for(i in c("plus", "minus")){
    grp_File[which(grepl(i, row.names(grp_File))),]$strand <- i
  }
  for(i in seqnames){
    for(j in c("plus", "minus")){
      tmp <- subset(grp_File, grp_File$seqname==i & grp_File$strand==j)[,c("WT", "dWT", "TV")]
      s=""
      if(j=="plus"){
        s="fw"
      }else{
        s="rev"
      }
      write.table(tmp, file=paste("output/grp/transcript/", i, "_transcript_WT-dWT-TV_", s, ".grp", sep=""), sep="\t", row.names=FALSE, col.names = FALSE)
    }
  }
}

2.3 Extract Results

Extract results and use changeAnnotation_DESeq2.py and annotation_locusTags_stand13012021.csv to add annotation to tables.

# extract results
CDS_result_dWT_TV <- results(ddsMat_CDS, contrast=c("strain", "dWT", "TV")) # dWT/TV -> higher in dWT: higher log2FC
write_tsv(rownames_to_column(as.data.frame(CDS_result_dWT_TV[order(CDS_result_dWT_TV$padj),])), file="output/DESeq2_resultsTables/results_CDS-dWT-TV.tsv")

CDS_result_dWT_WT <- results(ddsMat_CDS, contrast=c("strain", "dWT", "WT")) # dWT/WT -> higher in dWT: higher log2FC
write_tsv(rownames_to_column(as.data.frame(CDS_result_dWT_WT[order(CDS_result_dWT_WT$padj),])), file="output/DESeq2_resultsTables/results_CDS-dWT-WT.tsv")

TUs_result_dWT_TV <- results(ddsMat_TUs, contrast=c("strain", "dWT", "TV")) # dWT/TV -> higher in dWT: higher log2FC
write_tsv(rownames_to_column(as.data.frame(TUs_result_dWT_TV[order(TUs_result_dWT_TV$padj),])), file="output/DESeq2_resultsTables/results_TUs-dWT-TV.tsv")

TUs_result_dWT_WT <- results(ddsMat_TUs, contrast=c("strain", "dWT", "WT")) # dWT/WT -> higher in dWT: higher log2FC
write_tsv(rownames_to_column(as.data.frame(TUs_result_dWT_WT[order(TUs_result_dWT_WT$padj),])), file="output/DESeq2_resultsTables/results_TUs-dWT-WT.tsv")

Execute in directory “DESeq2_resultsTables”: python changeAnnotation_DESeq2.py results_CDS-dWT-TV.tsv results_CDS-dWT-TV_annotated.tsv python changeAnnotation_DESeq2.py results_CDS-dWT-WT.tsv results_CDS-dWT-WT_annotated.tsv

python TUs_add_info.py results_TUs-dWT-TV.tsv results_TUs-dWT-TV_annotated.tsv python TUs_add_info.py results_TUs-dWT-WT.tsv results_TUs-dWT-WT_annotated.tsv

2.4 p-Value plots for different comparisons

pvaluePlot(CDS_result_dWT_TV, "CDS rne(WT)-rne(5p)")

pvaluePlot(CDS_result_dWT_WT, "CDS rne(WT)-WT")

pvaluePlot(TUs_result_dWT_TV, "rne(WT)-rne(5p)")

pvaluePlot(TUs_result_dWT_WT, "TUs rne(WT)-WT")

## png 
##   2
## png 
##   2
## png 
##   2
## png 
##   2

2.5 Filter for non-NA values

CDS_result_dWT_TV <- subset(CDS_result_dWT_TV, !is.na(CDS_result_dWT_TV$padj))
nrow(CDS_result_dWT_TV)
## [1] 4411
CDS_result_dWT_WT <- subset(CDS_result_dWT_WT, !is.na(CDS_result_dWT_WT$padj))
nrow(CDS_result_dWT_WT)
## [1] 4519
TUs_result_dWT_TV <- subset(TUs_result_dWT_TV, !is.na(TUs_result_dWT_TV$padj))
nrow(TUs_result_dWT_TV)
## [1] 3000
TUs_result_dWT_WT <- subset(TUs_result_dWT_WT, !is.na(TUs_result_dWT_WT$padj))
nrow(TUs_result_dWT_WT)
## [1] 3699

2.6 Create MA and Volcano plots for different comparisons, count features up-, downregulated

2.6.1 CDS

2.6.1.1 CDS rne(WT), rne(5p)

count_up_down(CDS_result_dWT_TV, foldchange=0.8, padjusted=0.05)
## [1] "number of features down:  36"
## [1] "number of features up:  32"
p <- volcanoPlot_ggplot(as.data.frame(CDS_result_dWT_TV), foldchange=0.8, padjusted=0.05, text=TRUE)
p
## Warning: Removed 4343 rows containing missing values (geom_text_repel).
## Warning: ggrepel: 53 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

ggsave("output/DESeq2_Plots/CDS/ddsMat_CDS-dWT-TV_VolcanoPlot.pdf", plot=p, width=15, height=12, units="cm")
## Warning: Removed 4343 rows containing missing values (geom_text_repel).
## Warning: ggrepel: 63 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
p <- MAplot_ggplot(CDS_result_dWT_TV, foldchange=0.8, y_axis_label = "Log2 fold-change(rne(WT)/rne(5p))")# + ylim(-5.5,+5.5)
p

ggsave("output/DESeq2_Plots/CDS/ddsMat_CDS-dWT-TV_MAplot.pdf",plot=p, width=15, height=12, units="cm")

2.6.1.2 CDS rne(WT), WT

count_up_down(CDS_result_dWT_WT, foldchange=0.8, padjusted=0.05)
## [1] "number of features down:  76"
## [1] "number of features up:  87"
p <- volcanoPlot_ggplot(as.data.frame(CDS_result_dWT_WT), foldchange=0.8, padjusted=0.05, text=TRUE)
p
## Warning: Removed 4356 rows containing missing values (geom_text_repel).
## Warning: ggrepel: 132 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

ggsave("output/DESeq2_Plots/CDS/ddsMat_CDS-dWT-WT_VolcanoPlot.pdf", plot=p, width=15, height=12, units="cm")
## Warning: Removed 4356 rows containing missing values (geom_text_repel).
## Warning: ggrepel: 147 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
p <- MAplot_ggplot(CDS_result_dWT_WT, foldchange=0.8, y_axis_label = "Log2 fold-change(rne(WT)/WT)")# + ylim(-5.5,+5.5)
p

ggsave("output/DESeq2_Plots/CDS/ddsMat_CDS-dWT-WT_MAplot.pdf",plot=p, width=15, height=12, units="cm")

2.6.2 TUs

2.6.2.1 TUs rne(WT), rne(5p)

count_up_down(TUs_result_dWT_TV, foldchange=0.8, padjusted=0.05)
## [1] "number of features down:  30"
## [1] "number of features up:  29"
p <- volcanoPlot_ggplot(as.data.frame(TUs_result_dWT_TV), foldchange=0.8, padjusted=0.05)
p

ggsave("output/DESeq2_Plots/TU/ddsMat_TUs-dWT-TV_VolcanoPlot.pdf", plot=p, width=15, height=12, units="cm")

p <- MAplot_ggplot(TUs_result_dWT_TV, foldchange=0.8, y_axis_label = "Log2 fold-change(rne(WT)/rne(5p))")# + ylim(-5.5,+5.5)
p

ggsave("output/DESeq2_Plots/TU/ddsMat_TUs-dWT-TV_MAplot.pdf",plot=p, width=15, height=12, units="cm")

2.6.2.2 TU rne(WT), WT

count_up_down(TUs_result_dWT_WT, foldchange=0.8, padjusted=0.05)
## [1] "number of features down:  52"
## [1] "number of features up:  106"
p <- volcanoPlot_ggplot(as.data.frame(TUs_result_dWT_WT), foldchange=0.8, padjusted=0.05)
p

ggsave("output/DESeq2_Plots/TU/ddsMat_TUs-dWT-WT_VolcanoPlot.pdf", plot=p, width=15, height=12, units="cm")

p <- MAplot_ggplot(TUs_result_dWT_WT, foldchange=0.8, y_axis_label = "Log2 fold-change(rne(WT)/WT)")# + ylim(-5.5,+5.5)
p

ggsave("output/DESeq2_Plots/TU/ddsMat_TUs-dWT-WT_MAplot.pdf",plot=p, width=15, height=12, units="cm")

3 Exploratory Data Analysis

3.1 Create table how many RNA features of a certain type are affected by differential expression

features <- rtracklayer::import("input/20210217_syne_onlyUnique_withFeat.gff3")
CDS_features <- subset(features, features$type=="CDS")
TUs <- rtracklayer::import("input/Kopf_4091_TUs_combined.gff3")

types <- rep(c("CDS","5UTR", "3UTR", "tRNA", "rRNA", "ncRNA", "asRNA", "CRISPR", "misc"),2)
updown <- c(rep("up",9), rep("down",9))
df_features <- data.frame(cbind(type=types, updown=updown))

types <- c("CDS","5UTR", "3UTR", "tRNA", "rRNA", "ncRNA", "asRNA", "CRISPR", "misc")
for(t in types){
  t_feat <- subset(features, features$type==t)
  
  subset_CDS <- subset(CDS_result_dWT_TV, row.names(CDS_result_dWT_TV) %in% t_feat$locus_tag)
  
  index_up <- which(df_features$type==t & df_features$updown=="up")
  index_down <- which(df_features$type==t & df_features$updown=="down") 
  
  df_features[index_up,"number_feat_overlap"] <- nrow(subset(subset_CDS, subset_CDS$padj<0.05 & subset_CDS$log2FoldChange>0.8)) # count number of features affected
  df_features[index_down,"number_feat_overlap"] <- nrow(subset(subset_CDS, subset_CDS$padj<0.05 & subset_CDS$log2FoldChange<(-0.8))) # count number of features affected
  
  df_features[df_features$type==t,"number_total"] <- length(t_feat)
} 

df_features
##      type updown number_feat_overlap number_total
## 1     CDS     up                  13         3675
## 2    5UTR     up                   0          979
## 3    3UTR     up                   2           29
## 4    tRNA     up                   0           42
## 5    rRNA     up                   0            6
## 6   ncRNA     up                   7          318
## 7   asRNA     up                   9         1071
## 8  CRISPR     up                   1            3
## 9    misc     up                   0           11
## 10    CDS   down                  26         3675
## 11   5UTR   down                   1          979
## 12   3UTR   down                   0           29
## 13   tRNA   down                   0           42
## 14   rRNA   down                   2            6
## 15  ncRNA   down                   5          318
## 16  asRNA   down                   2         1071
## 17 CRISPR   down                   0            3
## 18   misc   down                   0           11
write.csv(df_features, file="output/RNAfeatures_upDown_dWT_TV.csv")
# prepare data.frame for barplot
types <- rep(c("CDS","5UTR", "3UTR", "tRNA", "rRNA", "ncRNA", "asRNA", "CRISPR", "misc"),2)
updown <- c(rep("up",9), rep("down",9))
df_features <- data.frame(cbind(type=types, updown=updown))

types <- c("CDS","5UTR", "3UTR", "tRNA", "rRNA", "ncRNA", "asRNA", "CRISPR", "misc")
for(t in types){
  t_feat <- subset(features, features$type==t)
  
  subset_CDS <- subset(CDS_result_dWT_WT, row.names(CDS_result_dWT_WT) %in% t_feat$locus_tag)
  
  index_up <- which(df_features$type==t & df_features$updown=="up")
  index_down <- which(df_features$type==t & df_features$updown=="down") 
  
  df_features[index_up,"number_feat_overlap"] <- nrow(subset(subset_CDS, subset_CDS$padj<0.05 & subset_CDS$log2FoldChange>0.8)) # count number of features affected
  df_features[index_down,"number_feat_overlap"] <- nrow(subset(subset_CDS, subset_CDS$padj<0.05 & subset_CDS$log2FoldChange<(-0.8))) # count number of features affected
  
  df_features[df_features$type==t,"number_total"] <- length(t_feat)
} 

df_features
##      type updown number_feat_overlap number_total
## 1     CDS     up                  72         3675
## 2    5UTR     up                   4          979
## 3    3UTR     up                   0           29
## 4    tRNA     up                   0           42
## 5    rRNA     up                   0            6
## 6   ncRNA     up                   3          318
## 7   asRNA     up                   5         1071
## 8  CRISPR     up                   3            3
## 9    misc     up                   0           11
## 10    CDS   down                  54         3675
## 11   5UTR   down                   6          979
## 12   3UTR   down                   0           29
## 13   tRNA   down                   0           42
## 14   rRNA   down                   0            6
## 15  ncRNA   down                  11          318
## 16  asRNA   down                   5         1071
## 17 CRISPR   down                   0            3
## 18   misc   down                   0           11
write.csv(df_features, file="output/RNAfeatures_upDown_dWT_WT.csv")

3.2 Functional Enrichment

go_functional_enrichment(CDS_result_dWT_TV, write=TRUE, path_up="output/enrichment/go_enrichment/dWT_TV_up.csv", path_down="output/enrichment/go_enrichment/dWT_TV_down.csv")
## [1] ID          Description GeneRatio   BgRatio     pvalue      p.adjust   
## [7] qvalue      geneID      Count      
## <0 rows> (or 0-length row.names)
## [1] ID          Description GeneRatio   BgRatio     pvalue      p.adjust   
## [7] qvalue      geneID      Count      
## <0 rows> (or 0-length row.names)
go_functional_enrichment(CDS_result_dWT_WT, write=TRUE, path_up="output/enrichment/go_enrichment/dWT_WT_up.csv", path_down="output/enrichment/go_enrichment/dWT_WT_down.csv")
##                    ID           Description GeneRatio BgRatio       pvalue
## GO:0008272 GO:0008272     sulfate transport      3/40 10/2432 0.0004569312
## GO:0004519 GO:0004519 endonuclease activity      4/40 24/2432 0.0005262848
##              p.adjust     qvalue                          geneID Count
## GO:0008272 0.01447283 0.01357261         slr1452/slr1453/slr1454     3
## GO:0004519 0.01447283 0.01357261 slr0393/slr1129/slr7068/slr7071     4
##                    ID                          Description GeneRatio BgRatio
## GO:0020037 GO:0020037                         heme binding      4/44 24/2432
## GO:0022904 GO:0022904 respiratory electron transport chain      3/44 12/2432
##                  pvalue  p.adjust     qvalue                          geneID
## GO:0020037 0.0007613007 0.0374427 0.03141645 sll0450/slr0898/slr1136/slr1137
## GO:0022904 0.0010852956 0.0374427 0.03141645         sll0450/slr1136/slr1137
##            Count
## GO:0020037     4
## GO:0022904     3
kegg_functional_enrichment(CDS_result_dWT_TV, write=TRUE, path_up="output/enrichment/kegg_enrichment/dWT_TV_up.csv", path_down="output/enrichment/kegg_enrichment/dWT_TV_down.csv")
## Reading KEGG annotation online:
## 
## Reading KEGG annotation online:
## [1] ID          Description GeneRatio   BgRatio     pvalue      p.adjust   
## [7] qvalue      geneID      Count      
## <0 rows> (or 0-length row.names)
## [1] ID          Description GeneRatio   BgRatio     pvalue      p.adjust   
## [7] qvalue      geneID      Count      
## <0 rows> (or 0-length row.names)
kegg_functional_enrichment(CDS_result_dWT_WT, write=TRUE, path_up="output/enrichment/kegg_enrichment/dWT_WT_up.csv", path_down="output/enrichment/kegg_enrichment/dWT_WT_down.csv")
##                ID       Description GeneRatio BgRatio       pvalue    p.adjust
## syn00920 syn00920 Sulfur metabolism      3/14  14/945 0.0008577528 0.009435281
##               qvalue                  geneID Count
## syn00920 0.007223182 slr1452/slr1453/slr1454     3
##                ID         Description GeneRatio BgRatio       pvalue
## syn00910 syn00910 Nitrogen metabolism      5/20  18/945 1.795269e-05
##              p.adjust       qvalue                                  geneID
## syn00910 0.0004308646 0.0003401562 sll0450/sll1450/sll1451/slr0898/slr0899
##          Count
## syn00910     5

3.3 Gene Set Enrichment Analyses

dWT_TV_go_gsea <- go_gsea(CDS_result_dWT_TV, write=TRUE, path="output/enrichment/go_gsea/dWT_TV_go_gsea.csv")
## preparing geneSet collections...
## GSEA analysis...
## leading edge analysis...
## done...
##                    ID                        Description setSize
## GO:0005840 GO:0005840                           ribosome      54
## GO:0003735 GO:0003735 structural constituent of ribosome      55
## GO:0006412 GO:0006412                        translation      89
## GO:0019843 GO:0019843                       rRNA binding      37
##            enrichmentScore       NES       pvalue    p.adjust      qvalues rank
## GO:0005840      -0.6390628 -2.295449 3.421905e-06 0.000508704 0.0004954709  782
## GO:0003735      -0.6311004 -2.269506 5.847173e-06 0.000508704 0.0004954709  782
## GO:0006412      -0.4883304 -1.952793 2.118945e-04 0.011935156 0.0116246826  993
## GO:0019843      -0.6347054 -2.107158 2.743714e-04 0.011935156 0.0116246826  856
##                              leading_edge
## GO:0005840 tags=63%, list=18%, signal=52%
## GO:0003735 tags=62%, list=18%, signal=52%
## GO:0006412 tags=47%, list=23%, signal=37%
## GO:0019843 tags=62%, list=19%, signal=51%
dWT_WT_go_gsea <- go_gsea(CDS_result_dWT_WT, write=TRUE, path="output/enrichment/go_gsea/dWT_WT_go_gsea.csv")
## preparing geneSet collections...
## GSEA analysis...
## leading edge analysis...
## done...
##                    ID                        Description setSize
## GO:0005840 GO:0005840                           ribosome      54
## GO:0003735 GO:0003735 structural constituent of ribosome      55
## GO:0019843 GO:0019843                       rRNA binding      37
## GO:0006412 GO:0006412                        translation      89
## GO:0003723 GO:0003723                        RNA binding      61
## GO:0004519 GO:0004519              endonuclease activity      24
##            enrichmentScore      NES       pvalue     p.adjust      qvalues rank
## GO:0005840       0.6623795 2.316038 2.294226e-07 4.014895e-05 3.477563e-05  480
## GO:0003735       0.6334220 2.215209 1.513803e-06 1.324578e-04 1.147303e-04  480
## GO:0019843       0.7021435 2.290831 5.304239e-06 3.094139e-04 2.680037e-04  471
## GO:0006412       0.5300910 2.042958 1.011786e-05 4.426562e-04 3.834135e-04 1072
## GO:0003723       0.5739667 2.048829 5.849954e-05 2.047484e-03 1.773460e-03  298
## GO:0004519       0.7017147 2.084257 1.213835e-04 3.244858e-03 2.810584e-03  698
##                              leading_edge
## GO:0005840 tags=44%, list=11%, signal=40%
## GO:0003735 tags=42%, list=11%, signal=38%
## GO:0019843 tags=49%, list=10%, signal=44%
## GO:0006412 tags=48%, list=24%, signal=38%
## GO:0003723  tags=18%, list=7%, signal=17%
## GO:0004519 tags=42%, list=15%, signal=35%
dWT_TV_kegg_gsea <- kegg_gsea(CDS_result_dWT_TV, write=TRUE, path="output/enrichment/kegg_gsea/dWT_TV_kegg_gsea.csv")
## preparing geneSet collections...
## GSEA analysis...
## leading edge analysis...
## done...
##                ID Description setSize enrichmentScore       NES       pvalue
## syn03010 syn03010    Ribosome      53      -0.6267786 -2.265002 1.351246e-05
##              p.adjust      qvalues rank                   leading_edge
## syn03010 0.0008107473 0.0008107473  774 tags=60%, list=18%, signal=50%
dWT_WT_kegg_gsea <- kegg_gsea(CDS_result_dWT_WT, write=TRUE, path="output/enrichment/kegg_gsea/dWT_WT_kegg_gsea.csv")
## preparing geneSet collections...
## GSEA analysis...
## leading edge analysis...
## done...
##                ID               Description setSize enrichmentScore       NES
## syn03010 syn03010                  Ribosome      53       0.6172389  2.148302
## syn00910 syn00910       Nitrogen metabolism      18      -0.7406120 -2.021652
## syn00920 syn00920         Sulfur metabolism      14       0.7610731  1.948010
## syn00190 syn00190 Oxidative phosphorylation      49      -0.5079952 -1.767313
##                pvalue     p.adjust      qvalues rank
## syn03010 1.234140e-05 0.0007528256 0.0006495475  480
## syn00910 8.109319e-05 0.0024733423 0.0021340313  899
## syn00920 1.511540e-03 0.0301079573 0.0259775300  648
## syn00190 1.974292e-03 0.0301079573 0.0259775300  978
##                            leading_edge
## syn03010 tags=40%, list=11%, signal=36%
## syn00910 tags=78%, list=20%, signal=63%
## syn00920 tags=57%, list=14%, signal=49%
## syn00190 tags=39%, list=22%, signal=31%

3.4 Plots of GSEA

gseaplot2(dWT_TV_go_gsea, geneSetID=1:4)

p <- gseaplot2(dWT_TV_kegg_gsea, geneSetID =1)
p

ggsave("output/enrichment/Plots/dWT_TV_KEGG.pdf", plot=p, width=15, height=12, units="cm")
gseaplot2(dWT_WT_go_gsea, geneSetID=1:9)

p <- gseaplot2(dWT_WT_kegg_gsea, geneSetID =1:4)
p

ggsave("output/enrichment/Plots/dWT_WT_KEGG.pdf", plot=p, width=15, height=12, units="cm")
browseKEGGNew_3(dWT_TV_kegg_gsea, "syn03010", 1) # 
browseKEGGNew_3(dWT_WT_kegg_gsea, "syn03010", 1) # 
browseKEGGNew_3(dWT_WT_kegg_gsea, "syn00910", 2) # 
browseKEGGNew_3(dWT_WT_kegg_gsea, "syn00920", 3) # 
browseKEGGNew_3(dWT_WT_kegg_gsea, "syn00190", 4) # 

3.5 GSEA for RNA features

First, a data frame in which feature names are assigned to their feature type (CDS, ncRNA, …) has to be created and the respective info read in.

df_featureType <- data.frame(feature_type=as.character(features$type), feature_name=features$locus_tag)

feature_type_gsea <- function(DESEq2_dataframe, write=FALSE, path=""){
  locus_tags <- row.names(DESEq2_dataframe)
  geneList <- DESEq2_dataframe$log2FoldChange
  names(geneList) <- locus_tags
  geneList = sort(geneList, decreasing = TRUE)
  
  set.seed(42)
  go_gsea_object <- GSEA(geneList, TERM2GENE = df_featureType, seed=TRUE)
  print(head(go_gsea_object)[,1:10])
  
  if(write){
    write.csv(go_gsea_object, path)
  }
  return(go_gsea_object)
}

features_gsea_dWT_TV <- feature_type_gsea(CDS_result_dWT_TV, TRUE, "output/enrichment/GSEA_RNAfeatures_dWT-TV.csv")
## preparing geneSet collections...
## GSEA analysis...
## leading edge analysis...
## done...
##          ID Description setSize enrichmentScore      NES       pvalue
## 3UTR   3UTR        3UTR      22       0.7332143 2.140823 1.025791e-05
## ncRNA ncRNA       ncRNA     224       0.3159062 1.420087 9.746224e-03
##           p.adjust      qvalues rank                   leading_edge
## 3UTR  4.103165e-05 2.159561e-05  254  tags=55%, list=6%, signal=52%
## ncRNA 1.949245e-02 1.025918e-02  672 tags=29%, list=15%, signal=26%
features_gsea_dWT_WT <- feature_type_gsea(CDS_result_dWT_WT, TRUE, "output/enrichment/GSEA_RNAfeatures_dWT-WT.csv")
## preparing geneSet collections...
## GSEA analysis...
## leading edge analysis...
## done...
##          ID Description setSize enrichmentScore       NES       pvalue
## ncRNA ncRNA       ncRNA     236      -0.3814923 -1.693676 0.0000167076
## 3UTR   3UTR        3UTR      23      -0.5893611 -1.746820 0.0048635827
## tRNA   tRNA        tRNA      36      -0.4795011 -1.542110 0.0175544830
## 5UTR   5UTR        5UTR     310      -0.2801913 -1.286619 0.0181392166
##           p.adjust qvalues rank                   leading_edge
## ncRNA 6.683038e-05      NA  948 tags=39%, list=21%, signal=33%
## 3UTR  9.727165e-03      NA  547 tags=48%, list=12%, signal=42%
## tRNA  1.813922e-02      NA  244  tags=25%, list=5%, signal=24%
## 5UTR  1.813922e-02      NA  880 tags=28%, list=19%, signal=25%
p <- gseaplot2(features_gsea_dWT_TV, geneSetID=1:2)
p

ggsave("output/enrichment/Plots/dWT_TV_RNAfeatures.pdf", plot=p, width=15, height=12, units="cm")
p <- gseaplot2(features_gsea_dWT_WT, geneSetID=1:4)
p

ggsave("output/enrichment/Plots/dWT_WT_RNAfeatures.pdf", plot=p, width=15, height=12, units="cm")

3.6 GSEA for base means

go_gsea_baseMeans <- function(DESEq2_dataframe, write=FALSE, path=""){
  locus_tags <- row.names(DESEq2_dataframe)
  geneList <- DESEq2_dataframe$baseMean
  names(geneList) <- locus_tags
  geneList = sort(geneList, decreasing = TRUE)
  
  set.seed(42)
  go_gsea_object <- GSEA(geneList, TERM2GENE = term_to_gene, TERM2NAME=term_to_name, seed=TRUE)
  tryCatch({
    print(head(go_gsea_object)[,1:10])
  
  if(write){
    write.csv(go_gsea_object, path)
  }
  return(go_gsea_object)}, error=function(e){
    print("nothing enriched")
  })
}

kegg_gsea_baseMean <- function(DESEq2_dataframe, write=FALSE, path=""){
  locus_tags <- row.names(DESEq2_dataframe)
  geneList <- DESEq2_dataframe$baseMean
  names(geneList) <- locus_tags
  geneList = sort(geneList, decreasing = TRUE)
  
  set.seed(42)
  kegg_gsea_object <- gseKEGG(geneList, organism="syn", minGSSize=10, pvalueCutoff = 0.05, seed=TRUE)
  tryCatch({
    print(head(kegg_gsea_object)[,1:10])
  
  if(write){
    write.csv(kegg_gsea_object, path)
  }
  return(kegg_gsea_object)}, error=function(e){
    print("nothing enriched")
  })
}

3.7 When taking width of features into account

norm_counts <- counts(ddsMat_CDS, normalized=TRUE)
mean_norm_counts <- apply(norm_counts, 1, mean)
mean_norm_counts_CDS <- subset(mean_norm_counts, names(mean_norm_counts) %in% CDS_features$locus_tag)
CDS_feat_df <- as.data.frame(CDS_features)
row.names(CDS_feat_df) <- CDS_feat_df$locus_tag
mean_norm_counts_CDS_width <- mean_norm_counts_CDS/CDS_feat_df[names(mean_norm_counts_CDS),]$width
mean_norm_counts_CDS_width <- sort(mean_norm_counts_CDS_width, decreasing=TRUE)
set.seed(42)
go_gsea_baseMeans_width <- GSEA(mean_norm_counts_CDS_width, TERM2GENE=term_to_gene, TERM2NAME = term_to_name, nPermSimple=10000, seed=TRUE)
## preparing geneSet collections...
## GSEA analysis...
## Warning in fgseaMultilevel(...): There were 3 pathways for which P-values were
## not calculated properly due to unbalanced (positive and negative) gene-level
## statistic values. For such pathways pval, padj, NES, log2err are set to NA. You
## can try to increase the value of the argument nPermSimple (for example set it
## nPermSimple = 100000)
## no term enriched under specific pvalueCutoff...
set.seed(42)
kegg_gsea_baseMeans_width <- gseKEGG(mean_norm_counts_CDS_width, organism="syn", minGSSize=10, pvalueCutoff = 0.05, nPermSimple=10000, seed=TRUE)
## preparing geneSet collections...
## GSEA analysis...
## no term enriched under specific pvalueCutoff...

4 GSEA for plasmids

First, a data frame in which feature names are assigned to their feature type (CDS, ncRNA, …) has to be created and the respective info read in.

df_plasmid <- data.frame(plasmid=as.character(seqnames(TUs)), TU_ID=TUs$index)

plasmid_gsea <- function(DESEq2_dataframe, write=FALSE, path=""){
  TU_IDs <- row.names(DESEq2_dataframe)
  geneList <- DESEq2_dataframe$log2FoldChange
  names(geneList) <- TU_IDs
  geneList = sort(geneList, decreasing = TRUE)
  
  set.seed(42)
  go_gsea_object <- GSEA(geneList, TERM2GENE = df_plasmid, seed=TRUE)
  print(head(go_gsea_object)[,1:10])
  
  if(write){
    write.csv(go_gsea_object, path)
  }
  return(go_gsea_object)
}

plasmid_gsea_dWT_TV <- plasmid_gsea(TUs_result_dWT_TV, TRUE, "output/enrichment/GSEA_plasmids_dWT-TV.csv")
## preparing geneSet collections...
## GSEA analysis...
## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.
## leading edge analysis...
## done...
##                    ID Description setSize enrichmentScore       NES
## AP004310.1 AP004310.1  AP004310.1      85      -0.6146525 -2.559085
## AP004311.1 AP004311.1  AP004311.1      73       0.6089396  2.328756
##                  pvalue     p.adjust      qvalues rank
## AP004310.1 1.000000e-10 4.000000e-10 2.105263e-10  348
## AP004311.1 1.104233e-08 2.208465e-08 1.162350e-08  408
##                              leading_edge
## AP004310.1 tags=38%, list=12%, signal=34%
## AP004311.1 tags=47%, list=14%, signal=41%
plasmid_gsea_dWT_WT <- plasmid_gsea(TUs_result_dWT_WT, TRUE, "output/enrichment/GSEA_plasmids_dWT-WT.csv")
## preparing geneSet collections...
## GSEA analysis...
## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.
## leading edge analysis...
## done...
##                    ID Description setSize enrichmentScore      NES       pvalue
## AP004310.1 AP004310.1  AP004310.1     114       0.7903643 2.956229 1.000000e-10
## AP004311.1 AP004311.1  AP004311.1      90       0.9072201 3.281283 1.000000e-10
## AP006585.1 AP006585.1  AP006585.1      40       0.7232452 2.294462 1.580906e-07
##                p.adjust      qvalues rank                   leading_edge
## AP004310.1 2.000000e-10 5.263158e-11  453 tags=74%, list=12%, signal=67%
## AP004311.1 2.000000e-10 5.263158e-11  217  tags=86%, list=6%, signal=83%
## AP006585.1 2.107875e-07 5.547039e-08  398 tags=57%, list=11%, signal=52%
p <- gseaplot2(plasmid_gsea_dWT_TV, geneSetID=1:2)
p

ggsave("output/enrichment/Plots/dWT_TV_plasmids.pdf", plot=p, width=15, height=12, units="cm")
p <- gseaplot2(plasmid_gsea_dWT_WT, geneSetID=1:3)
p

ggsave("output/enrichment/Plots/dWT_WT_plasmids.pdf", plot=p, width=15, height=12, units="cm")

Session info

## R version 4.0.4 (2021-02-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Debian GNU/Linux 11 (bullseye)
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/atlas/libblas.so.3.10.3
## LAPACK: /usr/lib/x86_64-linux-gnu/atlas/liblapack.so.3.10.3
## 
## locale:
##  [1] LC_CTYPE=de_DE.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=de_DE.UTF-8        LC_COLLATE=de_DE.UTF-8    
##  [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=de_DE.UTF-8   
##  [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] forcats_0.5.1               stringr_1.4.0              
##  [3] dplyr_1.0.5                 purrr_0.3.4                
##  [5] readr_1.4.0                 tidyr_1.1.3                
##  [7] tibble_3.1.0                tidyverse_1.3.0            
##  [9] rtracklayer_1.50.0          enrichplot_1.10.2          
## [11] clusterProfiler_3.18.1      RColorBrewer_1.1-2         
## [13] pheatmap_1.0.12             ggpubr_0.4.0               
## [15] ggrepel_0.9.1               ggplot2_3.3.3              
## [17] DESeq2_1.30.1               SummarizedExperiment_1.20.0
## [19] Biobase_2.50.0              MatrixGenerics_1.2.1       
## [21] matrixStats_0.58.0          GenomicRanges_1.42.0       
## [23] GenomeInfoDb_1.26.2         IRanges_2.24.1             
## [25] S4Vectors_0.28.1            BiocGenerics_0.36.0        
## [27] knitr_1.31                 
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1             shadowtext_0.0.7         backports_1.2.1         
##   [4] fastmatch_1.1-0          plyr_1.8.6               igraph_1.2.6            
##   [7] splines_4.0.4            BiocParallel_1.24.1      digest_0.6.27           
##  [10] htmltools_0.5.1.1        GOSemSim_2.16.1          viridis_0.5.1           
##  [13] GO.db_3.12.1             fansi_0.4.2              magrittr_2.0.1          
##  [16] memoise_2.0.0            openxlsx_4.2.3           Biostrings_2.58.0       
##  [19] annotate_1.68.0          graphlayouts_0.7.1       modelr_0.1.8            
##  [22] colorspace_2.0-0         rvest_1.0.0              blob_1.2.1              
##  [25] haven_2.3.1              xfun_0.22                crayon_1.4.1            
##  [28] RCurl_1.98-1.2           jsonlite_1.7.2           scatterpie_0.1.5        
##  [31] genefilter_1.72.1        survival_3.2-7           glue_1.4.2              
##  [34] polyclip_1.10-0          gtable_0.3.0             zlibbioc_1.36.0         
##  [37] XVector_0.30.0           DelayedArray_0.16.2      car_3.0-10              
##  [40] abind_1.4-5              scales_1.1.1             DOSE_3.16.0             
##  [43] DBI_1.1.1                rstatix_0.7.0            Rcpp_1.0.6              
##  [46] viridisLite_0.3.0        xtable_1.8-4             foreign_0.8-81          
##  [49] bit_4.0.4                httr_1.4.2               fgsea_1.16.0            
##  [52] ellipsis_0.3.1           pkgconfig_2.0.3          XML_3.99-0.5            
##  [55] farver_2.1.0             sass_0.3.1               dbplyr_2.1.0            
##  [58] locfit_1.5-9.4           utf8_1.1.4               labeling_0.4.2          
##  [61] tidyselect_1.1.0         rlang_0.4.10             reshape2_1.4.4          
##  [64] AnnotationDbi_1.52.0     munsell_0.5.0            cellranger_1.1.0        
##  [67] tools_4.0.4              cachem_1.0.4             cli_2.3.1               
##  [70] downloader_0.4           generics_0.1.0           RSQLite_2.2.3           
##  [73] broom_0.7.5              evaluate_0.14            fastmap_1.1.0           
##  [76] yaml_2.2.1               bit64_4.0.5              fs_1.5.0                
##  [79] tidygraph_1.2.0          zip_2.1.1                ggraph_2.0.5            
##  [82] xml2_1.3.2               DO.db_2.9                rstudioapi_0.13         
##  [85] compiler_4.0.4           curl_4.3                 ggsignif_0.6.1          
##  [88] reprex_1.0.0             tweenr_1.0.1             geneplotter_1.68.0      
##  [91] bslib_0.2.4              stringi_1.5.3            highr_0.8               
##  [94] lattice_0.20-41          Matrix_1.3-2             vctrs_0.3.6             
##  [97] pillar_1.5.1             lifecycle_1.0.0          BiocManager_1.30.10     
## [100] jquerylib_0.1.3          data.table_1.14.0        cowplot_1.1.1           
## [103] bitops_1.0-6             qvalue_2.22.0            R6_2.5.0                
## [106] gridExtra_2.3            rio_0.5.26               MASS_7.3-53.1           
## [109] assertthat_0.2.1         withr_2.4.1              GenomicAlignments_1.26.0
## [112] Rsamtools_2.6.0          GenomeInfoDbData_1.2.4   hms_1.0.0               
## [115] grid_4.0.4               rmarkdown_2.7            rvcheck_0.1.8           
## [118] carData_3.0-4            ggforce_0.3.3            lubridate_1.7.10